## OGR data source with driver: ESRI Shapefile 
## Source: "/home/dflynn-volpe/workingdata", layer: "hexagons_1mi_routes_AADT_total_sum"
## with 5201 features
## It has 4 fields
## OGR data source with driver: ESRI Shapefile 
## Source: "/home/dflynn-volpe/workingdata", layer: "hexagons_1mi_routes_sum"
## with 8111 features
## It has 3 fields
## OGR data source with driver: ESRI Shapefile 
## Source: "/home/dflynn-volpe/workingdata", layer: "hexagons_1mi_bg_lodes_sum"
## with 33184 features
## It has 32 fields
## OGR data source with driver: ESRI Shapefile 
## Source: "/home/dflynn-volpe/workingdata", layer: "hexagons_1mi_bg_rac_sum"
## with 33184 features
## It has 32 fields

Exploring imbalance

Actual sample size imbalance:

kable(w.04_09 %>% group_by(MatchEDT_buffer) %>% count(),  caption = "Count of EDT crash presence/absence over April-September 2017, Maryland")
Count of EDT crash presence/absence over April-September 2017, Maryland
MatchEDT_buffer n
0 1903966
1 99425

Imbalance has a temporal component, with more Waze records and more matching EDT crashes occurring in commuting times.

# long format using dplyr
hr.1 <- w.04_09 %>%
  group_by(hour, MatchEDT_buffer) %>%
  summarise(n = n())  %>%
  mutate(freq = round(100 * n / sum(n), 2))

kable(hr.1, caption = "Percent of EDT crash presence/absence records by hour over April-September 2017, Maryland") 
Percent of EDT crash presence/absence records by hour over April-September 2017, Maryland
hour MatchEDT_buffer n freq
0 0 30518 96.74
0 1 1029 3.26
1 0 23108 97.15
1 1 679 2.85
2 0 18995 97.33
2 1 521 2.67
3 0 18458 97.63
3 1 449 2.37
4 0 24427 97.54
4 1 616 2.46
5 0 41959 97.38
5 1 1130 2.62
6 0 70404 96.56
6 1 2511 3.44
7 0 107983 95.54
7 1 5044 4.46
8 0 120367 94.94
8 1 6420 5.06
9 0 109740 95.12
9 1 5625 4.88
10 0 100484 95.84
10 1 4365 4.16
11 0 102299 95.97
11 1 4298 4.03
12 0 105513 95.58
12 1 4874 4.42
13 0 107929 95.15
13 1 5506 4.85
14 0 113201 94.49
14 1 6606 5.51
15 0 125570 93.64
15 1 8523 6.36
16 0 136768 93.22
16 1 9941 6.78
17 0 140513 93.25
17 1 10167 6.75
18 0 116014 93.86
18 1 7592 6.14
19 0 85069 94.78
19 1 4687 5.22
20 0 66471 95.65
20 1 3025 4.35
21 0 55378 95.87
21 1 2386 4.13
22 0 46033 95.90
22 1 1967 4.10
23 0 36765 96.17
23 1 1464 3.83

Minimum and maximum imbalance are shown on the figure.

hr.1$text <- paste("Hour:", hr.1$hour, "\n EDT crash:", hr.1$MatchEDT_buffer,
                   "\n Frequency:", round(hr.1$freq, 2))

gp <- ggplot(hr.1, aes(x = hour, 
                       y = freq, 
                       group = MatchEDT_buffer, 
                       text = text)) +
 # geom_point() +
  ylab("Frequency") + xlab("Hour of Day") + theme_bw() + 
  ggtitle("Class imbalance by time") +
  geom_step(aes(color = MatchEDT_buffer), lwd = 2) 
  
# gp 

maxval <- hr.1 %>% 
  group_by(MatchEDT_buffer) %>% 
  summarise(max.freq = max(freq), 
            hr.max = hour[which(freq == max.freq)],
            min.freq = min(freq),
            hr.min= hour[which(freq == min.freq)])

plot.freq = c(maxval$max.freq, maxval$min.freq) + c(-3, +3, -3, +3)

gp2 <- gp + annotate("text", 
              x = c(maxval$hr.max, maxval$hr.min), 
              y = plot.freq, 
              label = round(c(maxval$max.freq, maxval$min.freq), 2)) + 
  guides(color=guide_legend(title="EDT crash class"))
# gp2

ggplotly(gp2, tooltip = "text") %>% layout(hovermode = 'x')
load(file.path(localdir, "Output_to_30_week"))
load(file.path(localdir, "Outputs_strata_test"))


tabl <- vector()
mods = c(names(keyoutputs)[4], names(keyoutputs_strata))

for(i in 1:length(mods)){ 
  if(i == 1){
    tabl <- rbind(tabl, c(100*keyoutputs[[mods[i]]]$diag, round(keyoutputs[[mods[i]]]$auc, 4)))
  } else {
   tabl <- rbind(tabl, c(100*keyoutputs_strata[[mods[i]]]$diag, round(as.numeric(keyoutputs_strata[[mods[i]]]$auc), 4)))
  }
  }

tabl <- as.data.frame(tabl)
colnames(tabl)[1:5] = c("Accuracy", "Precision", "Recall", "False Positive Rate", "AUC")
tabl$cutoff = c(.2, .5, .5, .5, .2, .2, .2)
tabl$sampling = c("None", "10:1", "5:1", "1:1", "10:1", "5:1", "1:1")
rownames(tabl) = mods
datatable(tabl, filter = 'top',
          caption = "Testing stratification and cutoffs for Model 30",
          rownames = T,
          options = list(dom = "t",
                         pageLength = length(mods))
          ) %>% formatCurrency(2:4, currency = "", digits = 2) %>%
          formatStyle(1, background = styleEqual(max(tabl[,1]), 'lightgreen'))%>%
          formatStyle(2, background = styleEqual(max(tabl[,2]), 'lightgreen'))%>%
          formatStyle(3, background = styleEqual(max(tabl[,3]), 'lightgreen'))%>%
          formatStyle(4, background = styleEqual(min(tabl[,4]), 'lightgreen'))%>%
          formatStyle(5, background = styleEqual(max(tabl[,5]), 'lightgreen'))

Variable importance:

s3load(object = file.path(outputdir, paste("Model_30_original_RandomForest_Output.RData", sep= "_")), bucket = waze.bucket)
rf.out.og <- rf.out

s3load(object = file.path(outputdir, "Stratification_test_RandomForest_Output.RData"), bucket = waze.bucket)

par(mfrow=c(2,2), mar = c(5,3, 3, 1))
varImpPlot(rf.out.og,
           main = "Model 30 Variable Importance \n Original",
           n.var = 15, 
           cex = 0.6)
varImpPlot(rf.30_10,
           main = "Model 30 Variable Importance \n 10:1",
           n.var = 15, 
           cex = 0.6)
varImpPlot(rf.30_05,
           main = "Model 30 Variable Importance \n 5:1",
           n.var = 15, 
           cex = 0.6)
varImpPlot(rf.30_01,
           main = "Model 30 Variable Importance \n 1:1",
           n.var = 15, 
           cex = 0.6)